Contents:

Load necessary packages

Load Cytokine data and clinial information

The data we will use were collected from individuals with and without Down sydrome. The samples are from blood plasma and cytokine data were generated for each individual.

# define the file paths to the input data
htp_meta_data_file <- here("2025/data", "HTP_Metadata_v0.5_Synapse.txt") 
htp_cytokines_data_file <- here("2025/data", "HTP_MSD_Cytokines_Synapse.txt") 

# Other parameters
# standard_colors <- c("Group1" = "#F8766D", "Group2" = "#00BFC4")
standard_colors <- c("Control" = "gray60", "T21" = "#009b4e")
out_file_prefix <- "linear_regression_htp_cytokines_v0.1_"
# End required parameters ###


# 1 Read in and inspect data ----
## 1.1 Read in meta data ----
htp_meta_data <- htp_meta_data_file |> 
  read_tsv() |> 
  mutate(
    Karyotype = fct_relevel(Karyotype, c("Control", "T21")), # convert to factor and set order
    Sex = fct_relevel(Sex, "Female"), # convert to factor and set order
    Sample_source_code = as_factor(Sample_source_code) # convert to factor - default is numerical order
  )
# inspect
htp_meta_data
## # A tibble: 1,010 × 14
##    ParticipantID LabID     Karyotype Sex      Age Weight_kg Height_cm
##    <chr>         <chr>     <fct>     <fct>  <dbl>     <dbl>     <dbl>
##  1 HTP0274       HTP0274A  T21       Male    17.5      45.3      157 
##  2 HTP0591       HTP0591A  T21       Male    31.9      72.8      154.
##  3 HTP0330       HTP0330A  T21       Male     6.1      16.8      102.
##  4 HTP0278       HTP0278A  T21       Male     7.6      22.5       NA 
##  5 HTP0052       HTP0052A  T21       Female  28.7      51.3      133.
##  6 HTP0052       HTP0052A2 T21       Female  29.9      50.9      134.
##  7 HTP0052       HTP0052A3 T21       Female  31        49.7      133 
##  8 HTP0678       HTP0678B  Control   Female  17.1      45        155 
##  9 HTP0709       HTP0709B  Control   Female  29.1      58.4      160.
## 10 HTP0236       HTP0236A  T21       Male    10.4      NA         NA 
## # ℹ 1,000 more rows
## # ℹ 7 more variables: Sample_source_code <fct>, Event_name <chr>, Race <chr>,
## #   Ethnicity <chr>, Data_contact <chr>, Date_exported <chr>, Script <chr>
htp_meta_data |> skimr::skim()
Data summary
Name htp_meta_data
Number of rows 1010
Number of columns 14
_______________________
Column type frequency:
character 8
factor 3
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ParticipantID 0 1.00 7 7 0 683 0
LabID 0 1.00 8 10 0 1010 0
Event_name 0 1.00 7 8 0 10 0
Race 8 0.99 5 41 0 8 0
Ethnicity 8 0.99 6 22 0 4 0
Data_contact 0 1.00 32 32 0 1 0
Date_exported 0 1.00 8 8 0 1 0
Script 0 1.00 23 23 0 1 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Karyotype 0 1 FALSE 2 T21: 641, Con: 369
Sex 0 1 FALSE 2 Fem: 555, Mal: 455
Sample_source_code 0 1 FALSE 9 1: 542, 6: 161, 3: 134, 2: 74

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Age 0 1.00 27.63 15.13 0.5 17.00 26.6 36.0 82.5 ▅▇▃▂▁
Weight_kg 169 0.83 64.70 25.11 6.7 51.60 65.8 79.7 149.6 ▂▆▇▂▁
Height_cm 175 0.83 152.25 21.91 64.0 145.25 155.0 166.0 196.0 ▁▁▂▇▂
#
here("2025/data", "HTP_Metadata_v0.5_dictionary.txt") |> read_tsv()
## # A tibble: 14 × 3
##    Variable           Example                 Description                       
##    <chr>              <chr>                   <chr>                             
##  1 ParticipantID      HTP0274                 Human Trisome Project participant…
##  2 LabID              HTP0274A                Human Trisome Project sample/bios…
##  3 Karyotype          T21                     Trisomy 21 status; trisomy 21/Dow…
##  4 Sex                Female                  Biological sex                    
##  5 Age                17.5                    Age at blood draw in years        
##  6 Weight_kg          45.3                    Weight at blood draw in kilograms 
##  7 Height_cm          157                     Height at blood draw in centimetr…
##  8 Sample_source_code 6                       Anonymous collection site identif…
##  9 Event_name         Visit 1                 Longitudinal visit/blood draw ide…
## 10 Race               White                   Self-reported race                
## 11 Ethnicity          Not Hispanic or Latino  Self-reported ethnicity           
## 12 Data_contact       email address           Contact for queries about data set
## 13 Date_exported      06012022                Date of data set preparation/expo…
## 14 Script             HTP_data_Synapse_v0.1.R Data preparation script
#

## 1.2 Read in abundance data ----
htp_cytokines_data <- htp_cytokines_data_file |> 
  read_tsv()
  # janitor::clean_names(case = "none")
# inspect
htp_cytokines_data # 25,758 rows
## # A tibble: 25,758 × 11
##    LabID    Sample_type Platform Batch Analyte     Units   Value N_imputed_wells
##    <chr>    <chr>       <chr>    <dbl> <chr>       <chr>   <dbl>           <dbl>
##  1 HTP0001B Plasma      MSD          3 CRP         pg/mL 3.22e+6               0
##  2 HTP0001B Plasma      MSD          3 Eotaxin     pg/mL 7.81e+1               0
##  3 HTP0001B Plasma      MSD          3 Eotaxin-3   pg/mL 1.20e+1               0
##  4 HTP0001B Plasma      MSD          3 FGF (basic) pg/mL 9.43e+0               0
##  5 HTP0001B Plasma      MSD          3 GM-CSF      pg/mL 3.19e-1               0
##  6 HTP0001B Plasma      MSD          3 ICAM-1      pg/mL 2.31e+5               0
##  7 HTP0001B Plasma      MSD          3 IFN-alpha2a pg/mL 9.29e-1               0
##  8 HTP0001B Plasma      MSD          3 IFN-beta    pg/mL 1.10e+1               2
##  9 HTP0001B Plasma      MSD          3 IFN-gamma   pg/mL 2.28e+0               0
## 10 HTP0001B Plasma      MSD          3 IL-10       pg/mL 3.06e-1               0
## # ℹ 25,748 more rows
## # ℹ 3 more variables: Data_contact <chr>, Date_exported <chr>, Script <chr>
htp_cytokines_data |> skimr::skim()
Data summary
Name htp_cytokines_data
Number of rows 25758
Number of columns 11
_______________________
Column type frequency:
character 8
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
LabID 0 1 8 9 0 477 0
Sample_type 0 1 6 6 0 1 0
Platform 0 1 3 3 0 1 0
Analyte 0 1 3 14 0 54 0
Units 0 1 5 5 0 1 0
Data_contact 0 1 32 32 0 1 0
Date_exported 0 1 8 8 0 1 0
Script 0 1 23 23 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Batch 0 1 4.98 1.08 3 4.00 5.00 6.00 6 ▃▂▁▆▇
Value 0 1 392846.59 5960600.07 0 0.61 5.22 106.11 380639855 ▇▁▁▁▁
N_imputed_wells 0 1 0.19 0.54 0 0.00 0.00 0.00 4 ▇▁▁▁▁
htp_cytokines_data |> distinct(Analyte) # 54 Analytes
## # A tibble: 54 × 1
##    Analyte    
##    <chr>      
##  1 CRP        
##  2 Eotaxin    
##  3 Eotaxin-3  
##  4 FGF (basic)
##  5 GM-CSF     
##  6 ICAM-1     
##  7 IFN-alpha2a
##  8 IFN-beta   
##  9 IFN-gamma  
## 10 IL-10      
## # ℹ 44 more rows
htp_cytokines_data |> distinct(LabID) # 477 LabIDs
## # A tibble: 477 × 1
##    LabID    
##    <chr>    
##  1 HTP0001B 
##  2 HTP0003A 
##  3 HTP0005A 
##  4 HTP0008B 
##  5 HTP0012A 
##  6 HTP0015A 
##  7 HTP0017A 
##  8 HTP0018B 
##  9 HTP0019B2
## 10 HTP0020A 
## # ℹ 467 more rows
#
here("2025/data", "HTP_MSD_Cytokines_dictionary.txt") |> read_tsv()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## # A tibble: 11 × 3
##    Variable        Example                             Description              
##    <chr>           <chr>                               <chr>                    
##  1 LabID           HTP0677B                            Human Trisome Project sa…
##  2 Sample_type     Plasma                              Type of sample taken/ext…
##  3 Platform        MSD                                 Description of assay pla…
##  4 Batch           Sample processing/measurement batch <NA>                     
##  5 Analyte         FGF (basic)                         Analyte name             
##  6 Units           pg/mL                               Description of measureme…
##  7 Value           1.319083834                         Measurement value        
##  8 N_imputed_wells 1                                   Number of out-of-range w…
##  9 Data_contact    email address                       Contact for queries abou…
## 10 Date_exported   11102021                            Date of data set prepara…
## 11 Script          HTP_data_Synapse_v0.1.R             Data preparation script
#

## 1.3 Join meta data with data type 1 and data type 2 ----
htp_meta_cytokines_data <- htp_cytokines_data |> 
  inner_join(htp_meta_data, by="LabID")
# check number of rows returned !!!


# 2 Data exploration  ----
## 2.1 basic check of data distribution(s) ----
htp_meta_cytokines_data |> 
  filter(Analyte == "CRP") |> 
  ggplot(aes(Karyotype, log2(Value), color = Karyotype)) +
  geom_boxplot()

#

Principal Component Analysis

We will use PCA to plot the data and explore sample information. The examples used here is taken from Statquest. Please refer to the PCA video.

Here is a useful post about PCA and how to think about the relationship of the components to variation.

We will also be using this image to illustrate the regression line fit regression fit

#create a data.frame of individuals by cytokines
cytokines_df <- as.data.frame(pivot_wider(htp_meta_cytokines_data, names_from = "Analyte", values_from = "Value", id_cols = "LabID"))
row.names(cytokines_df) <- cytokines_df$LabID
cytokines_df <- log2(cytokines_df[,-1])

# extract the annotations for each of the samples
pca_annos <- as.data.frame(htp_meta_data %>% filter(LabID %in% row.names(cytokines_df)))
row.names(pca_annos) <- pca_annos$LabID
pca_annos <- pca_annos[row.names(cytokines_df),]

# create a dataframe that will be used for plots that contains the cytokine and patient information.
cytokines_df_annos <- cbind(cytokines_df, pca_annos)

# Sanity check plots and basic plotting functions
# 1D
ids <- pca_annos$LabID[grep("T21", pca_annos$Karyotype)]
plot(cytokines_df[ids,]$`IFN-gamma`, rep(0,length(cytokines_df[ids,]$`IFN-gamma`)), pch=20, ylab="", xlab="IFN-gamma expression", cex=1, col="lightseagreen")
ids <- pca_annos$LabID[grep("Control", pca_annos$Karyotype)]
points(cytokines_df[ids,]$`IFN-gamma`, rep(0.5,length(cytokines_df[ids,]$`IFN-gamma`)), pch=20, col="salmon", cex=1)
legend(5, 1,  fill=c("lightseagreen", "salmon"), legend=c("T21", "Control"))

ggplot(cytokines_df_annos, aes(x=Karyotype, y=`IFN-gamma`, fill=Karyotype)) + geom_boxplot() + geom_jitter() + theme_bw()

# 2D
ggplot(cytokines_df_annos, aes(x=`IFN-gamma`, y=`TNF-alpha`, color=Karyotype)) + geom_point()  + theme_bw()

# lets look at a simplified example
Cytokine1 <- c(10,11,8,3,1,3)
Cytokine2 <- c(5,4,5,3,3,1)
plot(Cytokine1, Cytokine2, pch =19)

# find the mean of Cytokine1 and Cytokine2
plot(Cytokine1, Cytokine2, pch =19)
points(mean(Cytokine1), mean(Cytokine2), col="purple", pch=19, cex=3)  

# center the data and plot
Cytokine1 = Cytokine1 - mean(Cytokine1)
Cytokine2 = Cytokine2 - mean(Cytokine2)
plot(Cytokine1, Cytokine2, pch =19)
segments(0,-10,0,20, col="grey", lty=2)
segments(-10,0,20,0, col="grey", lty=2)

# add a regression line
plot(Cytokine1, Cytokine2, pch =19)
segments(0,-10,0,20, col="grey", lty=2)
segments(-10,0,20,0, col="grey", lty=2)
abline(lm(Cytokine2~Cytokine1))
segments(0,0,4,0, col="red", lwd=2)
segments(4,1,4,0, col="red", lwd=2)

# back to the HTP data
# find the mean of the X and Y directions
ggplot(cytokines_df_annos, aes(x=`IFN-gamma`, y=`TNF-alpha`)) + geom_point()  + theme_bw() +
   geom_point(aes(x=mean(cytokines_df_annos$`IFN-gamma`),y=mean(cytokines_df_annos$`TNF-alpha`)), colour="purple", size=5)
## Warning: Use of `` cytokines_df_annos$`IFN-gamma` `` is discouraged.
## ℹ Use `IFN-gamma` instead.
## Warning: Use of `` cytokines_df_annos$`TNF-alpha` `` is discouraged.
## ℹ Use `TNF-alpha` instead.
## Warning in geom_point(aes(x = mean(cytokines_df_annos$`IFN-gamma`), y = mean(cytokines_df_annos$`TNF-alpha`)), : All aesthetics have length 1, but the data has 477 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

# center the data and plot
cytokines_df_annos$`IFN-gamma-standard` <- cytokines_df_annos$`IFN-gamma` - mean(cytokines_df_annos$`IFN-gamma`)
cytokines_df_annos$`TNF-alpha-standard` <- cytokines_df_annos$`TNF-alpha` - mean(cytokines_df_annos$`TNF-alpha`)
ggplot(cytokines_df_annos, aes(x=`IFN-gamma-standard`, y=`TNF-alpha-standard`)) + geom_point()  + theme_bw()

# add a regression line
ggplot(cytokines_df_annos, aes(x=`IFN-gamma-standard`, y=`TNF-alpha-standard`)) + geom_point()  + theme_bw() + geom_smooth(method='lm',se=F) + geom_point(aes(x=0,y=0), colour="red")
## Warning in geom_point(aes(x = 0, y = 0), colour = "red"): All aesthetics have length 1, but the data has 477 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

# note in the prcomp implementation of PCA, 
# x = PCs
# rotation = loadings
# sdev^2 = eigenvalues

# PCA with HTP Cytokine data
pca <- prcomp(na.omit(cytokines_df), scale=T)

autoplot(pca, data=cytokines_df_annos, col='Karyotype')

autoplot(pca, data=cytokines_df_annos, col='Sex')

# remove outlier samples
hist(pca$x[,1], main ="PC1")

sort(pca$x[,1])
##      HTP0553A      HTP0572A      HTP0555B      HTP0565A      HTP0664A 
## -14.783670481 -14.673212866 -14.617885153 -13.591652177 -13.209987277 
##      HTP0589A      HTP0145B      HTP0621A      HTP0509B      HTP0538A 
## -12.333150731  -5.990444868  -5.970171387  -5.069048137  -4.769888832 
##      HTP0482A      HTP0117B      HTP0502B      HTP0642A      HTP0146B 
##  -4.704099037  -4.699243544  -4.476196024  -4.434195106  -4.415890687 
##      HTP0290B      HTP0639A      HTP0605A      HTP0582B      HTP0123A 
##  -4.311537801  -4.294316587  -4.283314312  -4.106288440  -4.083947213 
##      HTP0655B      HTP0669B      HTP0377B      HTP0629A      HTP0417B 
##  -4.030182722  -4.022421241  -3.918951995  -3.916210685  -3.875443975 
##      HTP0625A      HTP0503B      HTP0504B      HTP0675B      HTP0114B 
##  -3.716101377  -3.656992200  -3.588777540  -3.566045731  -3.527727428 
##      HTP0617A      HTP0594A      HTP0654A      HTP0547A     HTP0249A2 
##  -3.475964438  -3.428644965  -3.400758063  -3.350467090  -3.274531607 
##      HTP0375A      HTP0511B     HTP0316B3      HTP0622A      HTP0111B 
##  -3.266921605  -3.227866627  -3.205663439  -3.191586005  -3.190980606 
##      HTP0577A      HTP0578A      HTP0649B      HTP0623A      HTP0466B 
##  -3.180528434  -3.158894488  -3.052950424  -2.965812759  -2.961889262 
##      HTP0539A     HTP0322B2      HTP0626A      HTP0523B      HTP0372B 
##  -2.899828932  -2.888030808  -2.869612965  -2.866625070  -2.842191964 
##      HTP0447B      HTP0559B      HTP0524B      HTP0371B      HTP0677B 
##  -2.841129944  -2.820370009  -2.818396870  -2.810166234  -2.794497637 
##      HTP0676A      HTP0526B      HTP0645A      HTP0659A      HTP0278A 
##  -2.772060743  -2.725999338  -2.724721976  -2.720770577  -2.654511334 
##      HTP0474A      HTP0353B      HTP0496B      HTP0631A      HTP0413A 
##  -2.617633814  -2.613995468  -2.589618988  -2.561048625  -2.551169626 
##      HTP0039B      HTP0568A      HTP0606A      HTP0630A      HTP0624A 
##  -2.529701590  -2.529665426  -2.527911236  -2.521187843  -2.503016719 
##      HTP0289B      HTP0602A      HTP0636A      HTP0613A      HTP0149B 
##  -2.449139927  -2.434866842  -2.434827031  -2.432194280  -2.360598786 
##      HTP0541A      HTP0641A      HTP0569A      HTP0637A      HTP0540A 
##  -2.315607044  -2.307238410  -2.264758323  -2.249796697  -2.233773940 
##      HTP0529B      HTP0398A      HTP0031B      HTP0049B      HTP0459A 
##  -2.223288207  -2.217682683  -2.216009253  -2.204009291  -2.199059784 
##      HTP0546A      HTP0592A      HTP0646A      HTP0590A      HTP0661A 
##  -2.184452048  -2.181841753  -2.166110403  -2.164716875  -2.143387526 
##      HTP0634A      HTP0100B      HTP0588A      HTP0494A      HTP0391A 
##  -2.140755163  -2.136305030  -2.123282431  -2.096654856  -2.085542025 
##      HTP0057B      HTP0518A      HTP0580A      HTP0662A      HTP0346B 
##  -2.071378670  -2.061027587  -2.048353442  -2.016094800  -1.999841582 
##      HTP0573B      HTP0407A      HTP0396A      HTP0666A      HTP0615B 
##  -1.990788692  -1.977907083  -1.938521222  -1.909038268  -1.878877173 
##      HTP0047B      HTP0653B      HTP0408A      HTP0506B      HTP0343B 
##  -1.856159029  -1.850681903  -1.818162902  -1.789080461  -1.780480567 
##      HTP0652A      HTP0575A      HTP0673B      HTP0554A      HTP0347B 
##  -1.766222066  -1.731981097  -1.700032057  -1.693034725  -1.664183394 
##      HTP0369A      HTP0607A      HTP0116A      HTP0411A      HTP0436A 
##  -1.640622925  -1.618870994  -1.614636859  -1.592680864  -1.592093765 
##      HTP0477A      HTP0534A      HTP0583A      HTP0048B      HTP0551B 
##  -1.589052095  -1.582530258  -1.574116265  -1.571693009  -1.560727894 
##      HTP0603A      HTP0441A      HTP0431A      HTP0608A      HTP0550A 
##  -1.553689241  -1.551307963  -1.546379343  -1.516347789  -1.499583978 
##      HTP0567B      HTP0055B      HTP0581A      HTP0203A      HTP0672A 
##  -1.480522457  -1.468620028  -1.459936612  -1.439285986  -1.423884749 
##      HTP0528B      HTP0560A      HTP0433A      HTP0498B      HTP0651A 
##  -1.420901936  -1.406195530  -1.395875662  -1.380426004  -1.379221644 
##     HTP0241A2      HTP0527B      HTP0454A      HTP0367A      HTP0455B 
##  -1.373774111  -1.354164001  -1.351510939  -1.335173009  -1.292173077 
##      HTP0644A      HTP0060B      HTP0127B     HTP0247A2     HTP0219A2 
##  -1.291787417  -1.290964935  -1.286943839  -1.281962286  -1.276650958 
##      HTP0383A      HTP0633A      HTP0457B      HTP0423A      HTP0307B 
##  -1.253829586  -1.236236659  -1.230513693  -1.224285706  -1.222340531 
##      HTP0535A      HTP0557B      HTP0032A      HTP0376B      HTP0549A 
##  -1.216716889  -1.215812749  -1.198837112  -1.192677593  -1.182545550 
##     HTP0236A2      HTP0390A      HTP0041B      HTP0616A      HTP0584A 
##  -1.165320301  -1.135649515  -1.130714911  -1.130687198  -1.128305563 
##      HTP0473A     HTP0238A2      HTP0593A      HTP0110B      HTP0043B 
##  -1.127766324  -1.127202691  -1.098517709  -1.094264681  -1.088052924 
##      HTP0035B      HTP0280B      HTP0279B      HTP0008B      HTP0491A 
##  -1.083533490  -1.078190857  -1.056596112  -1.032782721  -1.003564498 
##      HTP0493B      HTP0663A      HTP0078B     HTP0277A2      HTP0044B 
##  -0.975868640  -0.966769945  -0.953203739  -0.900439563  -0.890701829 
##      HTP0632A     HTP0315A3      HTP0660A     HTP0263A2      HTP0001B 
##  -0.883341472  -0.875596680  -0.875506515  -0.860242865  -0.853838125 
##      HTP0638A      HTP0612A     HTP0328A2      HTP0324A      HTP0604A 
##  -0.853277956  -0.842644144  -0.831966722  -0.804723076  -0.783441481 
##      HTP0461A      HTP0591A      HTP0340A      HTP0513B      HTP0358B 
##  -0.779929518  -0.767851209  -0.755999913  -0.745228535  -0.730305855 
##      HTP0486A      HTP0475A     HTP0444B2      HTP0657A      HTP0548A 
##  -0.700460682  -0.699271813  -0.677516767  -0.662710030  -0.647853106 
##      HTP0643A      HTP0452A      HTP0098B      HTP0062B      HTP0349B 
##  -0.647714020  -0.639780480  -0.637709861  -0.630754620  -0.629257134 
##      HTP0388A      HTP0293A      HTP0618A      HTP0381A      HTP0512A 
##  -0.628620106  -0.611391908  -0.604912679  -0.594802058  -0.570377126 
##      HTP0330A      HTP0446A     HTP0259A2     HTP0314B2     HTP0326A3 
##  -0.569966294  -0.557552667  -0.550965590  -0.531754063  -0.530885834 
##      HTP0025A      HTP0650A      HTP0040B      HTP0609A      HTP0595B 
##  -0.499976785  -0.486173327  -0.479225358  -0.448486679  -0.439530671 
##      HTP0483A      HTP0050A      HTP0409A      HTP0476A      HTP0564A 
##  -0.415659525  -0.408688530  -0.398300093  -0.375892584  -0.371059600 
##      HTP0574A      HTP0378A      HTP0468A      HTP0611A      HTP0667A 
##  -0.355273825  -0.354423094  -0.344609303  -0.322328726  -0.302406321 
##      HTP0427A      HTP0384A      HTP0467A      HTP0086B      HTP0464A 
##  -0.298032948  -0.284110178  -0.278252131  -0.275738762  -0.272781746 
##      HTP0532A      HTP0387A      HTP0516A      HTP0596A      HTP0537A 
##  -0.249468073  -0.248587081  -0.230923488  -0.226531937  -0.214243630 
##     HTP0125B2      HTP0576B      HTP0521B      HTP0544A      HTP0362A 
##  -0.207135159  -0.203214407  -0.172550175  -0.167641159  -0.167490405 
##      HTP0426A      HTP0456A      HTP0061B      HTP0485A      HTP0058B 
##  -0.116600566  -0.091602978  -0.070986592  -0.042940134  -0.042392711 
##      HTP0561A      HTP0298A      HTP0570A      HTP0416A      HTP0026B 
##  -0.015154241  -0.008285802   0.032712400   0.040708630   0.040811914 
##      HTP0458A      HTP0665A      HTP0354A      HTP0598A      HTP0585A 
##   0.044454688   0.046490266   0.075253773   0.087911680   0.111858209 
##     HTP0261A2      HTP0418A      HTP0562A      HTP0318A      HTP0412A 
##   0.127736592   0.157823783   0.174057822   0.178282830   0.184043023 
##      HTP0640A      HTP0620A      HTP0619A      HTP0345A      HTP0571A 
##   0.187788354   0.193205431   0.195461120   0.255848309   0.264313463 
##      HTP0030A      HTP0479A      HTP0668A     HTP0262A2      HTP0597A 
##   0.267197802   0.292337720   0.314890883   0.344156184   0.347884390 
##      HTP0022B      HTP0294B      HTP0484A      HTP0439A      HTP0471A 
##   0.372246482   0.379214531   0.388912954   0.389648558   0.399410696 
##      HTP0059B      HTP0341A     HTP0361B2      HTP0543A      HTP0517A 
##   0.403193542   0.409038219   0.434809225   0.444570270   0.449057294 
##      HTP0053A      HTP0449A      HTP0395A      HTP0670A      HTP0379A 
##   0.477232850   0.503161625   0.526819465   0.532769854   0.544474915 
##      HTP0104B      HTP0321A      HTP0017A      HTP0415A      HTP0406A 
##   0.549153959   0.553817562   0.616654507   0.635759508   0.645043007 
##      HTP0579A      HTP0443A      HTP0400A      HTP0488A      HTP0414A 
##   0.662733753   0.694425604   0.703994646   0.707772548   0.764729650 
##      HTP0357A      HTP0380A      HTP0648B      HTP0092B      HTP0674B 
##   0.769347270   0.782613410   0.783900853   0.793472433   0.811929438 
##      HTP0424A      HTP0587A      HTP0147B      HTP0628B      HTP0365A 
##   0.822063729   0.824906965   0.862969741   0.872659704   0.881317390 
##      HTP0448A      HTP0089B     HTP0297A6      HTP0531A      HTP0487A 
##   0.896211344   0.918433247   0.956629861   0.956711632   0.976802603 
##      HTP0422A      HTP0492B      HTP0045A     HTP0251A2      HTP0469A 
##   0.977266780   0.987193256   1.036940798   1.070316183   1.085756705 
##      HTP0070A      HTP0198A      HTP0118B      HTP0325A      HTP0374A 
##   1.087152009   1.097726465   1.129523634   1.132477943   1.146705638 
##      HTP0463A      HTP0533B      HTP0148B      HTP0635A      HTP0079B 
##   1.171907124   1.176962459   1.178226927   1.180123801   1.190024735 
##      HTP0599A      HTP0556A      HTP0419A     HTP0303A2      HTP0542A 
##   1.202147485   1.285951255   1.290406070   1.298628636   1.317590139 
##      HTP0382A      HTP0450A      HTP0460A      HTP0480A      HTP0403A 
##   1.325898812   1.328568975   1.330398563   1.348059825   1.363234546 
##      HTP0563A     HTP0213A2      HTP0514A      HTP0368A      HTP0342A 
##   1.390507029   1.421625111   1.452971188   1.465134116   1.483614187 
##      HTP0420A      HTP0425A      HTP0003A      HTP0453A      HTP0445A 
##   1.484181631   1.497356611   1.519291772   1.553256334   1.579105895 
##      HTP0429B      HTP0428A      HTP0495B      HTP0530A      HTP0207A 
##   1.582163580   1.591240789   1.612996229   1.623159427   1.638911706 
##      HTP0402A      HTP0054A      HTP0027A      HTP0397A      HTP0438A 
##   1.645284930   1.648716371   1.649252885   1.710989834   1.717424180 
##      HTP0066A      HTP0389A      HTP0082B      HTP0394A      HTP0434A 
##   1.809557232   1.810591914   1.829420811   1.839997248   1.875099289 
##      HTP0101A     HTP0019B2      HTP0586A      HTP0515A      HTP0052A 
##   1.889660532   1.906089578   1.954783993   1.979196428   2.006058831 
##      HTP0385A      HTP0421A      HTP0090B      HTP0091B      HTP0647A 
##   2.023887263   2.076152722   2.080760574   2.084511621   2.091873071 
##      HTP0405A      HTP0020A      HTP0430A      HTP0352B      HTP0437A 
##   2.103968796   2.107160676   2.108632819   2.111092815   2.133441540 
##      HTP0545A     HTP0230A2      HTP0339A      HTP0087B      HTP0351A 
##   2.173090054   2.202269944   2.204272209   2.209880936   2.221788389 
##      HTP0119A      HTP0410A      HTP0478A      HTP0600A      HTP0627A 
##   2.232438627   2.240629933   2.282388820   2.315834029   2.371260016 
##      HTP0323A     HTP0226A2      HTP0399A      HTP0350A      HTP0601A 
##   2.374619911   2.384484484   2.423811985   2.428354154   2.447567808 
##      HTP0301A      HTP0291A      HTP0442A      HTP0152B      HTP0472A 
##   2.511123913   2.538504488   2.574917743   2.598031094   2.610900657 
##      HTP0205B      HTP0005A      HTP0507A      HTP0566A      HTP0105B 
##   2.620089105   2.645427183   2.648441021   2.662973921   2.682958370 
##      HTP0392A      HTP0329A      HTP0042B      HTP0126B      HTP0435A 
##   2.718986931   2.731070896   2.784889346   2.817243397   2.820595793 
##      HTP0109A      HTP0510A     HTP0195A2      HTP0296A      HTP0364A 
##   2.835167943   2.857973625   2.871710731   2.880778815   2.960967497 
##      HTP0073A      HTP0084A      HTP0107A      HTP0096B      HTP0440A 
##   2.964091929   2.997863643   3.052568212   3.062302194   3.090799806 
##      HTP0373A      HTP0432A      HTP0077B      HTP0056B      HTP0335A 
##   3.171763065   3.171784535   3.231464927   3.280360774   3.287150711 
##      HTP0083B      HTP0370A      HTP0151A      HTP0355B      HTP0012A 
##   3.351579593   3.409620114   3.427925418   3.586795124   3.617092615 
##      HTP0658A      HTP0470A      HTP0112A      HTP0401A      HTP0300A 
##   3.648862263   3.710620870   3.734998011   3.781285188   3.804422671 
##      HTP0337A      HTP0552A      HTP0451A      HTP0525A      HTP0462A 
##   3.833055128   4.008340600   4.029379911   4.036022446   4.083395121 
##      HTP0490A      HTP0080A      HTP0295A      HTP0558A      HTP0386A 
##   4.112996609   4.115460796   4.155816120   4.284132485   4.396789204 
##      HTP0065A      HTP0094B      HTP0115A      HTP0023A      HTP0465A 
##   4.534717753   4.566802497   4.584942290   4.620604484   4.628785396 
##      HTP0481B      HTP0034B      HTP0306A      HTP0320A      HTP0102B 
##   4.669996879   4.707714652   4.712407498   4.713731695   4.729215024 
##      HTP0093A      HTP0520B      HTP0331A      HTP0018B      HTP0015A 
##   4.749221257   4.768507553   4.797925323   4.893826130   5.042426152 
##      HTP0519A      HTP0095B      HTP0333A      HTP0501A      HTP0610A 
##   5.160414711   5.178340407   5.192708426   5.513095643   5.656085043 
##      HTP0393A      HTP0036A      HTP0336A      HTP0344A      HTP0319A 
##   6.162472189   6.193411542   6.249835190   6.328707922   6.913102237 
##      HTP0103A      HTP0404A      HTP0334A      HTP0327A      HTP0360A 
##   7.074180874   7.747392735   7.749683478   8.134269425   8.144170131 
##      HTP0081A      HTP0522A 
##   8.579780765   9.948338175
row.names(cytokines_df)[pca$x[,1] < -10]
## [1] "HTP0553A" "HTP0555B" "HTP0565A" "HTP0572A" "HTP0589A" "HTP0664A"
cytokines_df <- cytokines_df[pca$x[,1] > -10,]
cytokines_df_annos <- cytokines_df_annos[pca$x[,1] > -10,]
dim(cytokines_df)
## [1] 471  54
dim(cytokines_df_annos)
## [1] 471  70
pca <- prcomp(na.omit(cytokines_df), scale=T)
autoplot(pca, data=cytokines_df_annos, col='Karyotype')

autoplot(pca, data=cytokines_df_annos, col='Sex')

autoplot(pca, data=cytokines_df_annos, col='Age')

 # scree plot
var_explained = cbind(PC=seq(1, length(pca$sdev)), var=100*(pca$sdev^2 / sum(pca$sdev^2)))
ggplot(var_explained, aes(x=PC, y=var)) + geom_line() + xlab("Principal Component") + theme_bw() +
  ylab("Variance Explained (%)") + ggtitle("Scree Plot") + xlim(1,20)
## Warning: Removed 34 rows containing missing values or values outside the scale range
## (`geom_line()`).

UMAP: Uniform Manifold Approximation and Projection

We will use UMAP to plot the data and explore sample information. Bookmark the Statquest video on UMAP to review again and again in the future.

Here is a useful post that helps to undetstand the relationship between the parameters and visualize in UMAP.

# UMAP with HTP Cytokine data
u <- umap(cytokines_df)
cytokines_df_annos_umap <- cbind(u$layout, cytokines_df_annos)
colnames(cytokines_df_annos_umap)[1] <- "UMAP1"
colnames(cytokines_df_annos_umap)[2] <- "UMAP2"

ggplot(cytokines_df_annos_umap, aes(x=UMAP1, y=UMAP2, color=Karyotype)) + geom_point()  + theme_bw()   

# to explore the parameters in UMAP, you can see the default values and adjust in the function
umap.defaults

u <- umap(cytokines_df, n_neighbors=5)
cytokines_df_annos_umap <- cbind(u$layout, cytokines_df_annos)
colnames(cytokines_df_annos_umap)[1] <- "UMAP1"
colnames(cytokines_df_annos_umap)[2] <- "UMAP2"

ggplot(cytokines_df_annos_umap, aes(x=UMAP1, y=UMAP2, color=Karyotype)) + geom_point()  + theme_bw()   

t-distributed stochastic neighbor embedding

We will use tSNE to plot the data and explore sample information

# tSNE is stochastic so it will produce different results based on the random seed. To get the same results, you will need to fix the seed
set.seed(48673)

# theta is parameter that balances speed and accuracy. theta=0 is the exact tSNE calculation
# perplexity is the value that balances density of the cluster size

# tSNE with HTP Cytokine data
tsne <- Rtsne(cytokines_df, pca=F, verbose=T, perplexity=30, theta=0)
## Read the 471 x 54 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.000000
## Computing input similarities...
## Symmetrizing...
## Done in 0.02 seconds!
## Learning embedding...
## Iteration 50: error is 60.418804 (50 iterations in 0.07 seconds)
## Iteration 100: error is 60.228530 (50 iterations in 0.07 seconds)
## Iteration 150: error is 60.214475 (50 iterations in 0.07 seconds)
## Iteration 200: error is 60.218201 (50 iterations in 0.07 seconds)
## Iteration 250: error is 60.229697 (50 iterations in 0.07 seconds)
## Iteration 300: error is 1.151056 (50 iterations in 0.06 seconds)
## Iteration 350: error is 1.074016 (50 iterations in 0.07 seconds)
## Iteration 400: error is 1.046168 (50 iterations in 0.06 seconds)
## Iteration 450: error is 1.038032 (50 iterations in 0.06 seconds)
## Iteration 500: error is 1.033382 (50 iterations in 0.06 seconds)
## Iteration 550: error is 1.031259 (50 iterations in 0.07 seconds)
## Iteration 600: error is 1.028828 (50 iterations in 0.07 seconds)
## Iteration 650: error is 1.024679 (50 iterations in 0.07 seconds)
## Iteration 700: error is 1.023494 (50 iterations in 0.08 seconds)
## Iteration 750: error is 1.022628 (50 iterations in 0.08 seconds)
## Iteration 800: error is 1.022129 (50 iterations in 0.07 seconds)
## Iteration 850: error is 1.021786 (50 iterations in 0.07 seconds)
## Iteration 900: error is 1.021109 (50 iterations in 0.07 seconds)
## Iteration 950: error is 1.020432 (50 iterations in 0.07 seconds)
## Iteration 1000: error is 1.019759 (50 iterations in 0.07 seconds)
## Fitting performed in 1.39 seconds.
cytokines_df_annos_tsne <- cbind(tsne$Y, cytokines_df_annos)
colnames(cytokines_df_annos_tsne)[1] <- "tsne1"
colnames(cytokines_df_annos_tsne)[2] <- "tsne2"

ggplot(cytokines_df_annos_tsne, aes(x=tsne1, y=tsne2, color=Karyotype)) + geom_point()  + theme_bw()   

ggplot(cytokines_df_annos_tsne, aes(x=tsne1, y=tsne2, color=Sex)) + geom_point()  + theme_bw()   

# playing with perplexity
tsne <- Rtsne(cytokines_df, pca=F, verbose=T, perplexity=5, theta=0)
## Read the 471 x 54 data matrix successfully!
## Using no_dims = 2, perplexity = 5.000000, and theta = 0.000000
## Computing input similarities...
## Symmetrizing...
## Done in 0.02 seconds!
## Learning embedding...
## Iteration 50: error is 79.949794 (50 iterations in 0.07 seconds)
## Iteration 100: error is 76.424634 (50 iterations in 0.08 seconds)
## Iteration 150: error is 75.864472 (50 iterations in 0.08 seconds)
## Iteration 200: error is 76.498432 (50 iterations in 0.08 seconds)
## Iteration 250: error is 77.159962 (50 iterations in 0.07 seconds)
## Iteration 300: error is 1.534669 (50 iterations in 0.08 seconds)
## Iteration 350: error is 1.306232 (50 iterations in 0.08 seconds)
## Iteration 400: error is 1.230411 (50 iterations in 0.07 seconds)
## Iteration 450: error is 1.192336 (50 iterations in 0.07 seconds)
## Iteration 500: error is 1.168750 (50 iterations in 0.07 seconds)
## Iteration 550: error is 1.158394 (50 iterations in 0.07 seconds)
## Iteration 600: error is 1.151216 (50 iterations in 0.07 seconds)
## Iteration 650: error is 1.145885 (50 iterations in 0.07 seconds)
## Iteration 700: error is 1.141815 (50 iterations in 0.07 seconds)
## Iteration 750: error is 1.138590 (50 iterations in 0.07 seconds)
## Iteration 800: error is 1.136004 (50 iterations in 0.07 seconds)
## Iteration 850: error is 1.133839 (50 iterations in 0.07 seconds)
## Iteration 900: error is 1.132043 (50 iterations in 0.08 seconds)
## Iteration 950: error is 1.130481 (50 iterations in 0.08 seconds)
## Iteration 1000: error is 1.129110 (50 iterations in 0.07 seconds)
## Fitting performed in 1.49 seconds.
cytokines_df_annos_tsne <- cbind(tsne$Y, cytokines_df_annos)
colnames(cytokines_df_annos_tsne)[1] <- "tsne1"
colnames(cytokines_df_annos_tsne)[2] <- "tsne2"
ggplot(cytokines_df_annos_tsne, aes(x=tsne1, y=tsne2, color=Karyotype)) + geom_point()  + theme_bw()   

tsne <- Rtsne(cytokines_df, pca=F, verbose=T, perplexity=100, theta=0)
## Read the 471 x 54 data matrix successfully!
## Using no_dims = 2, perplexity = 100.000000, and theta = 0.000000
## Computing input similarities...
## Symmetrizing...
## Done in 0.02 seconds!
## Learning embedding...
## Iteration 50: error is 46.447575 (50 iterations in 0.07 seconds)
## Iteration 100: error is 46.447790 (50 iterations in 0.07 seconds)
## Iteration 150: error is 46.443755 (50 iterations in 0.07 seconds)
## Iteration 200: error is 46.447459 (50 iterations in 0.07 seconds)
## Iteration 250: error is 46.447851 (50 iterations in 0.07 seconds)
## Iteration 300: error is 0.657454 (50 iterations in 0.07 seconds)
## Iteration 350: error is 0.630432 (50 iterations in 0.07 seconds)
## Iteration 400: error is 0.626774 (50 iterations in 0.07 seconds)
## Iteration 450: error is 0.626621 (50 iterations in 0.07 seconds)
## Iteration 500: error is 0.626348 (50 iterations in 0.07 seconds)
## Iteration 550: error is 0.626346 (50 iterations in 0.07 seconds)
## Iteration 600: error is 0.626346 (50 iterations in 0.07 seconds)
## Iteration 650: error is 0.626346 (50 iterations in 0.07 seconds)
## Iteration 700: error is 0.626346 (50 iterations in 0.07 seconds)
## Iteration 750: error is 0.626346 (50 iterations in 0.07 seconds)
## Iteration 800: error is 0.626346 (50 iterations in 0.07 seconds)
## Iteration 850: error is 0.626346 (50 iterations in 0.07 seconds)
## Iteration 900: error is 0.626346 (50 iterations in 0.07 seconds)
## Iteration 950: error is 0.626346 (50 iterations in 0.07 seconds)
## Iteration 1000: error is 0.626346 (50 iterations in 0.07 seconds)
## Fitting performed in 1.39 seconds.
cytokines_df_annos_tsne <- cbind(tsne$Y, cytokines_df_annos)
colnames(cytokines_df_annos_tsne)[1] <- "tsne1"
colnames(cytokines_df_annos_tsne)[2] <- "tsne2"
ggplot(cytokines_df_annos_tsne, aes(x=tsne1, y=tsne2, color=Karyotype)) + geom_point()  + theme_bw()   

Session Information

sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS 15.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] here_1.0.1       conflicted_1.2.0 patchwork_1.3.0  janitor_2.2.1   
##  [5] broom_1.0.7      skimr_2.1.5      tictoc_1.2.1     ggforce_0.5.0   
##  [9] ggrepel_0.9.6    rstatix_0.7.2    lubridate_1.9.4  forcats_1.0.0   
## [13] stringr_1.5.1    dplyr_1.1.4      purrr_1.0.4      readr_2.1.5     
## [17] tidyr_1.3.1      tibble_3.2.1     tidyverse_2.0.0  openxlsx_4.2.8  
## [21] readxl_1.4.5     cluster_2.1.8    limma_3.58.1     umap_0.2.10.0   
## [25] Rtsne_0.17       factoextra_1.0.7 ggfortify_0.4.17 ggplot2_3.5.1   
## 
## loaded via a namespace (and not attached):
##  [1] gridExtra_2.3       rlang_1.1.5         magrittr_2.0.3     
##  [4] snakecase_0.11.1    compiler_4.3.3      mgcv_1.9-1         
##  [7] png_0.1-8           vctrs_0.6.5         pkgconfig_2.0.3    
## [10] crayon_1.5.3        fastmap_1.2.0       backports_1.5.0    
## [13] labeling_0.4.3      utf8_1.2.4          rmarkdown_2.29     
## [16] tzdb_0.4.0          bit_4.5.0.1         xfun_0.51          
## [19] cachem_1.1.0        jsonlite_1.9.1      tweenr_2.0.3       
## [22] parallel_4.3.3      R6_2.6.1            bslib_0.9.0        
## [25] stringi_1.8.4       reticulate_1.41.0.1 car_3.1-3          
## [28] jquerylib_0.1.4     cellranger_1.1.0    Rcpp_1.0.14        
## [31] knitr_1.50          base64enc_0.1-3     Matrix_1.6-5       
## [34] splines_4.3.3       timechange_0.3.0    tidyselect_1.2.1   
## [37] rstudioapi_0.17.1   abind_1.4-8         yaml_2.3.10        
## [40] lattice_0.22-6      withr_3.0.2         askpass_1.2.1      
## [43] evaluate_1.0.3      polyclip_1.10-7     zip_2.3.1          
## [46] pillar_1.10.1       carData_3.0-5       generics_0.1.3     
## [49] vroom_1.6.5         rprojroot_2.0.4     hms_1.1.3          
## [52] munsell_0.5.1       scales_1.3.0        glue_1.8.0         
## [55] tools_4.3.3         RSpectra_0.16-2     grid_4.3.3         
## [58] colorspace_2.1-1    nlme_3.1-166        repr_1.1.7         
## [61] Formula_1.2-5       cli_3.6.4           gtable_0.3.6       
## [64] sass_0.4.9          digest_0.6.37       farver_2.1.2       
## [67] memoise_2.0.1       htmltools_0.5.8.1   lifecycle_1.0.4    
## [70] statmod_1.5.0       openssl_2.3.2       bit64_4.6.0-1      
## [73] MASS_7.3-60.0.1